Visualize all Observations by State & Species

data(us.cities)

# Get major cities for each sample region (state)
.states <- c("OR", "VT", "CO", "NC")
top.cities <- purrr::map_df(.states, function(s) {
  out <- us.cities %>% 
  filter(country.etc==s) %>%
  mutate(city = gsub(paste0(" ", s), "", name)) %>%
  arrange(-pop)
  if (s == "OR") {
    out <- out %>% 
      head() %>%
      filter(!(city %in% c("Gresham", "Hillsboro", "Corvallis",
                           "Beaverton", "Springfield")))
  } else if (s == "CO") {
    out <- out %>%
      head() %>%
      filter(!(city %in% c("Thornton", "Lakewood", "Aurora")))
  } else if (s == "NC") {
    out <- out %>%
      head() %>%
      filter(!(city %in% c("Greensboro", "Durham", "Fayetteville")))
  } else {
    out <- out %>% head()
  }
  out
})

# Load the map data
states <- map_data("state") %>% 
  filter(region %in% c("oregon", "north carolina", "colorado", "vermont"))

# Load your data
data.files <- list.files("data/final", full.names = T)

df <- purrr::map_df(data.files, readRDS) 

caps.after.ws <- function(string) {
  gsub("(?<=\\s)([a-z])", "\\U\\1", string, perl = T)
}

# Define a function to create a plot for each species
plot.for.species <- function(spec, st.abbr) {
  st <- case_when(st.abbr == "CO" ~ "colorado",
                  st.abbr == "NC" ~ "north carolina",
                  st.abbr == "VT" ~ "vermont",
                  st.abbr == "OR" ~ "oregon",
                  T ~ "")
  
  title <- caps.after.ws(paste(st.abbr, gsub("_", " ", spec), 
                             "Observations, 2016-2019"))
  
  p <- ggplot(data = states %>% filter(region == st)) +
    geom_polygon(aes(x = long, y = lat, group = group),
                 fill = "#989875", color = "black") +
    geom_point(data = df %>% filter(state == st.abbr & common.name == spec), 
               aes(x = lon, y = lat), 
               size=1, alpha=.5, fill = "red", shape=21) +
    geom_point(data = top.cities %>% filter(country.etc == st.abbr), 
               aes(x=long, y=lat),
               fill="gold", color="black", size=3.5, shape = 21) + 
    geom_text(data = top.cities %>% filter(country.etc == st.abbr), 
              aes(x=long, y=lat, label=city),
              color="white", hjust=case_when(st.abbr=="NC"~.2,
                                               st.abbr=="VT"~.65,
                                               T~.5),
              vjust=ifelse(st.abbr=="NC", -.65, 1.5),
              size=4) + 
    coord_map() +
    ggtitle(title) +
    theme_minimal() +
    theme(panel.background = element_blank(),
          axis.text = element_blank(),
          axis.title = element_blank(),
          axis.ticks = element_blank(),
          panel.grid = element_blank())

  data.table(
    state=st.abbr,
    species=spec,
    plot=list(p)
  )
}

spec.state <- expand.grid(unique(df$common.name), unique(df$state)) %>%
  rename(spec=Var1, st.abbr=Var2) 

# Create a list of plots
plots <- purrr::map2_df(spec.state$spec, 
                        spec.state$st.abbr, 
                        ~plot.for.species(.x, .y))
# Plot Ruddy Duck plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Ruddy Duck"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Belted Kingfisher plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Belted Kingfisher"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Wild Turkey plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Wild Turkey"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Sharp-Shinned Hawk plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Sharp-shinned Hawk"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Downy Woodpecker Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Downy Woodpecker"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Cedar Waxwing Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Cedar Waxwing"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Sandhill Crane Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Sandhill Crane"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Sanderling Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Sanderling"]$plot, 
          list(nrow=2, ncol=2)))

Explore Explanatory Rasters

states <- c("CO", "NC", "OR", "VT")
r.files <- paste0("data/final_rasters/", states, ".tif")
r.list <- purrr::map(r.files, rast)
names(r.list) <- states

TODO: - terra::freq - terra::density - terra::layerCor

Principal Component Analysis

r.df <- map_df(states, function(s) {
  df <- r.list[[s]] %>% as.data.frame()
  names(df) <- names(df) %>% gsub(paste0("_", s), "", .)
  df %>% setDT()
  df[, state := factor(s, levels=states)]
  df[apply(df, 1, function(.x) !any(is.na(.x)))]
}) 

# Custom function to process factor levels
clean.levels <- function(x) {
  # Remove non-alphanumeric characters and replace with underscores
  x <- gsub("[^a-zA-Z0-9]", "_", x)
  # Convert to lowercase
  x <- tolower(x)
  # Remove any leading or trailing underscores
  x <- gsub("^_|_$", "", x)
  x <- gsub("__", "_", x)
  x <- gsub("NLCD_Land_", "", x)
  return(x)
}

r.df[, NLCD_Land := factor(clean.levels(levels(NLCD_Land))[NLCD_Land])]

# Convert factor columns to dummy variables
df.dummies <- data.table(model.matrix(~ . - 1, data = r.df[, .(NLCD_Land, state)])) %>%
  cbind(r.df[, -which(names(r.df) %in% c("NLCD_Land", "state")), with=F]) 

names(df.dummies) <- gsub("NLCD_Land", "", names(df.dummies))

# Ensure that there is more than one value per column (remove otherwise)
uniq.1 <- t( df.dummies[, lapply(.SD, uniqueN)]) %>%
  as.data.frame() %>%
  filter(V1 == 1) %>%
  row.names()

if (length(uniq.1) >= 1) {
  df.dummies <- df.dummies[, -which(names(df.dummies) %in% uniq.1), with=F]
}


pca.fit <- PCA(df.dummies, graph=F)
plot.PCA(pca.fit, choix="var")

res <- pca.fit$var$coord %>%
  as.data.frame() %>%
  mutate(var=as.factor(rownames(.))) %>%
  select(var, everything()) %>%
  as_tibble()
rownames(res) <- NULL
  
p.d1 <- ggplot(res, aes(x = reorder(var, Dim.1), y = Dim.1)) +
  geom_bar(stat = "identity", fill="darkblue") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.1", y = "Importance", x = "Variable") +
  theme_minimal()

p.d2 <- ggplot(res, aes(x = reorder(var, Dim.2), y = Dim.2)) +
  geom_bar(stat = "identity", fill="darkred") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.2", y = "Importance", x = "Variable") +
  theme_minimal()

ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)

Factor Analysis for Mixed Data

famd.fit <- FAMD(r.df, graph=F)

ggpubr::ggarrange(plotlist=purrr::map(
  c("var", "quanti", "quali"), 
  ~plot.FAMD(famd.fit, choix=.x)),
  ncol=1, nrow=3)

res <- famd.fit$var$coord %>%
  as.data.frame() %>%
  mutate(var=as.factor(rownames(.))) %>%
  select(var, everything()) %>%
  as_tibble()
rownames(res) <- NULL
  
p.d1 <- ggplot(res, aes(x = reorder(var, Dim.1), y = Dim.1)) +
  geom_bar(stat = "identity", fill="darkblue") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.1", y = "Importance", x = "Variable") +
  theme_minimal()

p.d2 <- ggplot(res, aes(x = reorder(var, Dim.2), y = Dim.2)) +
  geom_bar(stat = "identity", fill="darkred") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.2", y = "Importance", x = "Variable") +
  theme_minimal()

ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)

Pseudo-Absence Generation

# Set up output directory
output.dir <- "artifacts/masks_5k"
if (!dir.exists("artifacts")) dir.create("artifacts")
if (!dir.exists(output.dir)) dir.create(output.dir)

# LOAD DATA 

# Get raster data
states <- c("CO", "NC", "OR", "VT")
r.list <- purrr::map(paste0("data/final_rasters/", states, ".tif"), rast)
names(r.list) <- states

# Get observation data
obs.df <- list.files("data/final", full.names = T) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observation.point=geometry)

# BUFFERING RASTER DATA 

dist <- 5e3

mask.update <- function(i, mask.raster, obs.df, obs.field="observation.point",
                        dist=10000, u="m") {
  obs.pt <- st_transform(obs.df[i, "observation.point"], st_crs(mask.raster))
  # Create a buffer around the point, ensuring correct units
  buf <- st_buffer(obs.pt, dist=units::as_units(paste(dist, u)))
  return(terra::rasterize(buf, mask.raster, update=T, field=1))
}

# For each observation point, you can now create a distance 
# raster and then mask cells within the buffer distance
get.buffered.zones <- function(r, obs.df, obs.field="observation.point",
                               dist=10000, u="m") {
  # Create an empty raster with the same extent and resolution as r
  mask.raster <- terra::rast(r)
  # Recursively update mask.raster with additional buffered regions
  for(i in 1:nrow(obs.df)) {
    mask.raster <- mask.update(i, mask.raster, obs.df, obs.field=obs.field, dist=dist, u=u)
    gc()
  }
  return(mask.raster)
}

# Get masks by state, species
masks <- purrr::map(states, function(state) {
    specs <- sort(unique(obs.df$common.name))
    spec.masks <- purrr::map(specs, function(spec, st=state) {
      fname <- file.path(output.dir, paste0(st, "_", spec, ".tif"))
      if (file.exists(fname)) {
        cat("Reading", spec, st, "mask from", fname, "\n")
        r.mask <- rast(fname)
      } else {
        cat("Computing", spec, st, "mask, and saving to", fname, "\n")
        r.mask <- get.buffered.zones(r=r.list[[st]], 
                                     obs.df=filter(obs.df, state == st & common.name == spec),
                                     dist=dist)
        terra::writeRaster(r.mask, fname, overwrite=T)
      }
      gc()
      r.mask
    }, .progress=T)
    names(spec.masks) <- specs
    spec.masks
  })
## Reading Belted Kingfisher CO mask from artifacts/masks_5k/CO_Belted Kingfisher.tif 
## Reading Cedar Waxwing CO mask from artifacts/masks_5k/CO_Cedar Waxwing.tif 
## Reading Downy Woodpecker CO mask from artifacts/masks_5k/CO_Downy Woodpecker.tif 
## Reading Ruddy Duck CO mask from artifacts/masks_5k/CO_Ruddy Duck.tif 
## Reading Sanderling CO mask from artifacts/masks_5k/CO_Sanderling.tif 
## Reading Sandhill Crane CO mask from artifacts/masks_5k/CO_Sandhill Crane.tif 
## Reading Sharp-shinned Hawk CO mask from artifacts/masks_5k/CO_Sharp-shinned Hawk.tif 
## Reading Wild Turkey CO mask from artifacts/masks_5k/CO_Wild Turkey.tif 
## Reading Belted Kingfisher NC mask from artifacts/masks_5k/NC_Belted Kingfisher.tif 
## Reading Cedar Waxwing NC mask from artifacts/masks_5k/NC_Cedar Waxwing.tif 
## Reading Downy Woodpecker NC mask from artifacts/masks_5k/NC_Downy Woodpecker.tif 
## Reading Ruddy Duck NC mask from artifacts/masks_5k/NC_Ruddy Duck.tif 
## Reading Sanderling NC mask from artifacts/masks_5k/NC_Sanderling.tif 
## Reading Sandhill Crane NC mask from artifacts/masks_5k/NC_Sandhill Crane.tif 
## Reading Sharp-shinned Hawk NC mask from artifacts/masks_5k/NC_Sharp-shinned Hawk.tif 
## Reading Wild Turkey NC mask from artifacts/masks_5k/NC_Wild Turkey.tif 
## Reading Belted Kingfisher OR mask from artifacts/masks_5k/OR_Belted Kingfisher.tif 
## Reading Cedar Waxwing OR mask from artifacts/masks_5k/OR_Cedar Waxwing.tif 
## Reading Downy Woodpecker OR mask from artifacts/masks_5k/OR_Downy Woodpecker.tif 
## Reading Ruddy Duck OR mask from artifacts/masks_5k/OR_Ruddy Duck.tif 
## Reading Sanderling OR mask from artifacts/masks_5k/OR_Sanderling.tif 
## Reading Sandhill Crane OR mask from artifacts/masks_5k/OR_Sandhill Crane.tif 
## Reading Sharp-shinned Hawk OR mask from artifacts/masks_5k/OR_Sharp-shinned Hawk.tif 
## Reading Wild Turkey OR mask from artifacts/masks_5k/OR_Wild Turkey.tif 
## Reading Belted Kingfisher VT mask from artifacts/masks_5k/VT_Belted Kingfisher.tif 
## Reading Cedar Waxwing VT mask from artifacts/masks_5k/VT_Cedar Waxwing.tif 
## Reading Downy Woodpecker VT mask from artifacts/masks_5k/VT_Downy Woodpecker.tif 
## Reading Ruddy Duck VT mask from artifacts/masks_5k/VT_Ruddy Duck.tif 
## Reading Sanderling VT mask from artifacts/masks_5k/VT_Sanderling.tif 
## Reading Sandhill Crane VT mask from artifacts/masks_5k/VT_Sandhill Crane.tif 
## Reading Sharp-shinned Hawk VT mask from artifacts/masks_5k/VT_Sharp-shinned Hawk.tif 
## Reading Wild Turkey VT mask from artifacts/masks_5k/VT_Wild Turkey.tif
names(masks) <- states
# Function to sample n points from the non-masked parts
sample.inverse.mask <- function(r.original, r.mask, n, 
                                species, state,
                                sample.crs=4326, min.n=500,
                                output.dir="artifacts/pseudo_absence_regions") {
  if (!dir.exists(output.dir)) dir.create(output.dir)
  output.path <- file.path(output.dir,
                           gsub(" |\\-", "_", 
                                paste0(
                                  paste(state, tolower(species), sep="_"), 
                                  ".tif")
                           ))
  if (!file.exists(output.path)) {
    # Get inverse mask;
    # Set NA cells to 0, keep 0 cells as 0, change other cells to 1
    r.inverse <- terra::ifel(is.na(r.mask), 0, r.mask)
    # Set 0 to 1 and everything else to NA
    r.inverse <- terra::lapp(r.inverse, fun = function(x) ifelse(x == 0, 1, NA))
    # Crop so that anything outside of the state is NA
    r.cropped <- terra::crop(r.inverse, terra::ext(r.original))
    
    # Create a binary raster from r.original where valid values are 
    # set to 1 and NA values remain NA
    r.binary <- terra::lapp(r.original[[1]], fun = function(x) ifelse(!is.na(x), 1, NA))
    
    # Multiply the cropped raster by the binary raster to ensure 
    # outside values are set to NA
    r.final <- r.cropped * r.binary
    terra::writeRaster(r.final, output.path, overwrite=T)
  } else {
    r.final <- terra::rast(output.path)
  }
  
  # Convert the raster to SpatialPoints
  gdf <- terra::as.points(r.final) %>%
    st_as_sf() %>%
    st_transform(crs=sample.crs)
  if (nrow(gdf) > 0) {
    gdf <- gdf %>%
      filter(!is.na(layer)) %>%
      select(geometry)
  } else {
    return(gdf)
  }
    
  # Set to min.n size if n < min.n
  if (n < min.n) n <- min.n
  # Make sure there is sufficient available sample points
  if (n > nrow(gdf)) n <- nrow(gdf)
  
  # Randomly sample n points from the available (non-masked) space
  sample.idx <- sample(1:nrow(gdf), n)
  samples <- gdf %>% 
    slice(sample.idx) %>%
    mutate(common.name = species, 
           state = state, 
           lon = NA, 
           lat = NA,
           observations=0)
  
  # Populate lon and lat values:
  coords <- st_coordinates(samples)
  samples$lon <- coords[, "X"]
  samples$lat <- coords[, "Y"]
  
  return(samples)
}

# Get totals by species and state
totals <- obs.df %>%
  as_tibble() %>%
  select(state, common.name) %>%
  group_by(state, common.name) %>%
  summarize(N=n(), .groups="keep")

# Create a list of pseudo absence points, by species and state,
# where the sample number `n` >= 500 | `n` == the total observed
# for each respective state and species
pseudo.absence.pts <- list()
for (st in states) {
  r.original <- r.list[[st]]
  r.masks <- masks[[st]]
  pseudo.absence.pts[[st]] <- list()
  for (spec in names(r.masks)) {
    r.mask <- r.masks[[spec]]
    n <- totals %>% filter(state == st & common.name == spec) %>% pull(N)
    cat("Generating", n, "pseudo-absence points for the", spec, "in", st, "\n")
    pseudo.absence.pts[[st]][[spec]] <- sample.inverse.mask(
      r.original, r.mask, spec, st, n=n, sample.crs=4326)
    cat("\tGenerated", nrow(pseudo.absence.pts[[st]][[spec]]), "/", n, "points.\n")
  }
}
## Generating 4551 pseudo-absence points for the Belted Kingfisher in CO 
##  Generated 4551 / 4551 points.
## Generating 3446 pseudo-absence points for the Cedar Waxwing in CO 
##  Generated 3446 / 3446 points.
## Generating 7511 pseudo-absence points for the Downy Woodpecker in CO 
##  Generated 7511 / 7511 points.
## Generating 1726 pseudo-absence points for the Ruddy Duck in CO 
##  Generated 1726 / 1726 points.
## Generating 131 pseudo-absence points for the Sanderling in CO 
##  Generated 500 / 131 points.
## Generating 1532 pseudo-absence points for the Sandhill Crane in CO 
##  Generated 1532 / 1532 points.
## Generating 2257 pseudo-absence points for the Sharp-shinned Hawk in CO 
##  Generated 2257 / 2257 points.
## Generating 2611 pseudo-absence points for the Wild Turkey in CO 
##  Generated 2611 / 2611 points.
## Generating 4183 pseudo-absence points for the Belted Kingfisher in NC 
##  Generated 4183 / 4183 points.
## Generating 4195 pseudo-absence points for the Cedar Waxwing in NC 
##  Generated 4195 / 4195 points.
## Generating 10914 pseudo-absence points for the Downy Woodpecker in NC 
##  Generated 10914 / 10914 points.
## Generating 1106 pseudo-absence points for the Ruddy Duck in NC 
##  Generated 1106 / 1106 points.
## Generating 311 pseudo-absence points for the Sanderling in NC 
##  Generated 500 / 311 points.
## Generating 118 pseudo-absence points for the Sandhill Crane in NC 
##  Generated 500 / 118 points.
## Generating 1254 pseudo-absence points for the Sharp-shinned Hawk in NC 
##  Generated 1254 / 1254 points.
## Generating 2372 pseudo-absence points for the Wild Turkey in NC 
##  Generated 2372 / 2372 points.
## Generating 5837 pseudo-absence points for the Belted Kingfisher in OR 
##  Generated 5837 / 5837 points.
## Generating 8446 pseudo-absence points for the Cedar Waxwing in OR 
##  Generated 8446 / 8446 points.
## Generating 8576 pseudo-absence points for the Downy Woodpecker in OR 
##  Generated 8576 / 8576 points.
## Generating 2010 pseudo-absence points for the Ruddy Duck in OR 
##  Generated 2010 / 2010 points.
## Generating 258 pseudo-absence points for the Sanderling in OR 
##  Generated 500 / 258 points.
## Generating 2458 pseudo-absence points for the Sandhill Crane in OR 
##  Generated 2458 / 2458 points.
## Generating 2714 pseudo-absence points for the Sharp-shinned Hawk in OR 
##  Generated 2714 / 2714 points.
## Generating 2440 pseudo-absence points for the Wild Turkey in OR 
##  Generated 2440 / 2440 points.
## Generating 2033 pseudo-absence points for the Belted Kingfisher in VT 
##  Generated 2033 / 2033 points.
## Generating 3938 pseudo-absence points for the Cedar Waxwing in VT 
##  Generated 1169 / 3938 points.
## Generating 4635 pseudo-absence points for the Downy Woodpecker in VT 
##  Generated 1679 / 4635 points.
## Generating 51 pseudo-absence points for the Ruddy Duck in VT 
##  Generated 500 / 51 points.
## Generating 39 pseudo-absence points for the Sanderling in VT 
##  Generated 500 / 39 points.
## Generating 76 pseudo-absence points for the Sandhill Crane in VT 
##  Generated 500 / 76 points.
## Generating 748 pseudo-absence points for the Sharp-shinned Hawk in VT 
##  Generated 748 / 748 points.
## Generating 2181 pseudo-absence points for the Wild Turkey in VT 
##  Generated 2181 / 2181 points.
# Extract raster values for each point
if (!dir.exists(file.path("data", "final_pseudo_absence"))) {
  dir.create(file.path("data", "final_pseudo_absence"))
}
for (state in states) {
  out.file.all <- file.path("data", "final_pseudo_absence", paste0("pa_", state, ".rds"))
  if (!file.exists(out.file.all)) {
    r <- r.list[[state]]
    
    cat(sprintf("Extracting points to values for %s...\n", state))
    # Load observations shapefile
    geo.df <- pseudo.absence.pts[[state]] %>% do.call("rbind", .)
    rownames(geo.df) <- NULL
    
    geo.df.crs <- st_crs(geo.df)
    
    # Define target CRS and update
    target.crs <- st_crs(r)
    cat(sprintf("Updating CRS for %s...\n", state))
    geo.df <- st_transform(geo.df, target.crs)
    
    # Extract raster values
    for (r.name in names(r)) {
      cat("\tExtracting", r.name, "values for", state, "\n")
      x <- terra::extract(r[[r.name]], geo.df)[[r.name]]
      geo.df[[gsub(paste0("_", state), "", r.name)]] <- x
    }
    
    # Update crs back
    geo.df <- st_transform(geo.df, geo.df.crs)
    
    # Fix names; Filter NA values
    geo.df <- geo.df %>%
      filter(dplyr::if_all(names(.), ~!is.na(.))) %>%
      suppressWarnings() 
    
    saveRDS(geo.df, out.file.all)
    cat("--------------\n")
  }
}
# Get all pseudo-absence data
abs.df <- list.files(file.path("data", "final_pseudo_absence"), full.names = T) %>%
  purrr::map_df(readRDS) %>%
  select(state, common.name, observation.point=geometry)

# There might be some slight differences since there are occasionally NA values
abs.df %>%
  as_tibble() %>%
  select(state, common.name) %>%
  group_by(state, common.name) %>%
  summarize(psuedo.absence.N=n(), .groups="keep") %>%
  left_join(totals, by=c("state", "common.name")) %>%
  rename(observed.N = N) %>%
  knitr::kable()
state common.name psuedo.absence.N observed.N
CO Belted Kingfisher 4523 4551
CO Cedar Waxwing 3431 3446
CO Downy Woodpecker 7440 7511
CO Ruddy Duck 1715 1726
CO Sanderling 497 131
CO Sandhill Crane 1524 1532
CO Sharp-shinned Hawk 2241 2257
CO Wild Turkey 2597 2611
NC Belted Kingfisher 4042 4183
NC Cedar Waxwing 4059 4195
NC Downy Woodpecker 10415 10914
NC Ruddy Duck 1076 1106
NC Sanderling 488 311
NC Sandhill Crane 489 118
NC Sharp-shinned Hawk 1211 1254
NC Wild Turkey 2261 2372
OR Belted Kingfisher 5803 5837
OR Cedar Waxwing 8405 8446
OR Downy Woodpecker 8529 8576
OR Ruddy Duck 1996 2010
OR Sanderling 496 258
OR Sandhill Crane 2443 2458
OR Sharp-shinned Hawk 2696 2714
OR Wild Turkey 2429 2440
VT Belted Kingfisher 1956 2033
VT Cedar Waxwing 1098 3938
VT Downy Woodpecker 1598 4635
VT Ruddy Duck 490 51
VT Sanderling 493 39
VT Sandhill Crane 492 76
VT Sharp-shinned Hawk 730 748
VT Wild Turkey 2090 2181

Train/Test Splitting

TODO: Include pseudo-absence data

stratified.split.idx <- function(df, p=0.7, lat.lon.bins=25) {
  # Cut along lat/lon values to create grids (lat.bin & lon.bin)
  # lat.lon.bins is the number of divisions you want
  df$lat.bin <- cut(df$lat, breaks=lat.lon.bins, labels = F)
  df$lon.bin <- cut(df$lon, breaks=lat.lon.bins, labels = F)
  
  # Create a new variable combining the stratification variables
  df %>%
    mutate(strata = paste(lat.bin, lon.bin, common.name, state)) %>%
    pull(strata) %>%
    # Create the data partitions
    createDataPartition(., p = p, list = F) %>%
    suppressWarnings()
}

prepare.data <- function(df, p=.7, lat.lon.bins=25) {
  train.index <- stratified.split.idx(df, p=p, lat.lon.bins = lat.lon.bins)
  df.train <- df[train.index, ]
  df.test <- df[-train.index, ]
  
  list(train = df.train, 
       test = df.test,
       index = train.index)
}

train.test <- prepare.data(df, .7)
train <- df[train.test$index,]
test <- df[-train.test$index,]

EDA With Pseudo-Absence Data

Autocorrelation Mitigation

Feature Engineering

Land Cover Hierarchical Updates to Categories

Each of the 20 different Land Cover Categories falls under a “parent” category (see National Land Cover Database Class Legend and Description).

Feature Selection